{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Wrangle HaluEval\n",
    "\n",
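    "Downloads the [HaluEval](https://github.com/RUCAIBox/HaluEval) QA data, reshapes it into one row per (reference, query, response) with an `is_hallucination` label, and writes the result to a JSON Lines file."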
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from urllib.request import urlopen\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Do not truncate long text columns when frames are displayed.\n",
    "pd.set_option(\"display.max_colwidth\", None)\n",
    "\n",
    "data_url = \"https://raw.githubusercontent.com/RUCAIBox/HaluEval/main/data/qa_data.json\"\n",
    "\n",
    "# The download is parsed as JSON Lines: one JSON object per line.\n",
    "records = []\n",
    "with urlopen(data_url) as response:\n",
    "    # Iterate the response to stream lines rather than buffering them all via readlines().\n",
    "    for raw_line in response:\n",
    "        # Skip blank lines, which json.loads would reject with a JSONDecodeError.\n",
    "        if raw_line.strip():\n",
    "            records.append(json.loads(raw_line))\n",
    "df = pd.DataFrame(records)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reshape wide -> long: each input row becomes two rows, one per answer column,\n",
    "# with the originating column name held temporarily in \"answer_type\".\n",
    "# NOTE: this overwrites `df`; re-running this cell requires re-running the load\n",
    "# cell first, because the \"knowledge\"/\"question\" columns are renamed below.\n",
    "df = (\n",
    "    df.melt(\n",
    "        id_vars=[\"knowledge\", \"question\"],\n",
    "        value_vars=[\"right_answer\", \"hallucinated_answer\"],\n",
    "        var_name=\"answer_type\",\n",
    "        value_name=\"answer\",\n",
    "    )\n",
    "    # Flag rows whose answer came from the hallucinated answer column.\n",
    "    .assign(is_hallucination=lambda melted: melted[\"answer_type\"] == \"hallucinated_answer\")\n",
    "    .drop(columns=\"answer_type\")\n",
    "    .sort_values([\"knowledge\", \"question\"])\n",
    "    .reset_index(drop=True)\n",
    "    .rename(columns={\"knowledge\": \"reference\", \"question\": \"query\", \"answer\": \"response\"})\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_path = \"halueval_qa_data.jsonl\"\n",
    "\n",
    "# Mode \"w\" truncates any existing file, so every run starts from an empty file.\n",
    "# Deleting the file and then appending is avoided: if the delete failed and the\n",
    "# error was ignored, new rows would be appended after stale ones.\n",
    "with open(output_path, \"w\") as f:\n",
    "    # One JSON object per line (JSON Lines).\n",
    "    for record in df.to_dict(orient=\"records\"):\n",
    "        f.write(json.dumps(record) + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
